In [74]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame
sns.set(style='whitegrid')
In [2]:
path = os.path.join('Data storage', 'Facility gen fuels and CO2 2017-05-25.zip')
facility_df = pd.read_csv(path, parse_dates=['datetime'])
In [3]:
facility_df.head()
Out[3]:
In [4]:
facility_df.dropna(inplace=True, subset=['lat', 'lon'])
In [5]:
cols = ['lat', 'lon', 'plant id', 'year']
small_facility = facility_df.loc[:, cols].drop_duplicates()
In [6]:
geometry = [Point(xy) for xy in zip(small_facility.lon, small_facility.lat)]
# small_facility = small_facility.drop(['lon', 'lat'], axis=1)
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(small_facility, crs=crs, geometry=geometry)
In [7]:
path = os.path.join('Data storage', 'NERC_Regions_EIA', 'NercRegions_201610.shp')
regions = gpd.read_file(path)
In [8]:
facility_nerc = gpd.sjoin(geo_df, regions, how='inner', op='within')
In [9]:
facility_nerc.head()
Out[9]:
In [10]:
cols = ['plant id', 'year', 'NERC']
facility_df = facility_df.merge(facility_nerc.loc[:, cols],
on=['plant id', 'year'], how='left')
In [11]:
facility_df.head()
Out[11]:
Filter out data older than 2014 to reduce size
In [12]:
facility_df['state'] = facility_df['geography'].str[-2:]
keep_cols = ['fuel', 'year', 'month', 'datetime', 'state', 'plant id', 'NERC',
'generation (MWh)', 'total fuel (mmbtu)', 'elec fuel (mmbtu)']
facility_df = facility_df.loc[facility_df['year'] >= 2014, keep_cols]
In [13]:
facility_fuel_cats = {'COW': ['SUB', 'BIT', 'LIG', 'WC', 'SC', 'RC', 'SGC'],
'NG': ['NG'],
'PEL': ['DFO', 'RFO', 'KER', 'JF',
'PG', 'WO', 'SGP'],
'PC': ['PC'],
'HYC': ['WAT'],
'HPS': [],
'GEO': ['GEO'],
'NUC': ['NUC'],
'OOG': ['BFG', 'OG', 'LFG'],
'OTH': ['OTH', 'MSN', 'MSW', 'PUR', 'TDF', 'WH'],
'SUN': ['SUN'],
'DPV': [],
'WAS': ['OBL', 'OBS', 'OBG', 'MSB', 'SLW'],
'WND': ['WND'],
'WWW': ['WDL', 'WDS', 'AB', 'BLQ']
}
In [14]:
for category in facility_fuel_cats.keys():
fuels = facility_fuel_cats[category]
facility_df.loc[facility_df['fuel'].isin(fuels),
'fuel category'] = category
In [15]:
facility_df.head()
Out[15]:
In [16]:
facility_df.dtypes
Out[16]:
In [17]:
facility_df.loc[facility_df['NERC'].isnull(), 'state'].unique()
Out[17]:
In [26]:
folder = os.path.join('Data storage', 'Derived data', 'state gen data')
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE",
"FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS",
"KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS",
"MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY",
"NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
"SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
In [29]:
state_list = []
for state in states:
path = os.path.join(folder, '{} fuels gen.csv'.format(state))
df = pd.read_csv(path, parse_dates=['datetime'])
state_list.append(df)
state_df = pd.concat(state_list)
state_df.reset_index(inplace=True, drop=True)
In [30]:
state_df.dtypes
Out[30]:
In [31]:
state_df['state'] = state_df['geography'].str[-2:]
keep_cols = ['state', 'type', 'year', 'datetime', 'generation (MWh)',
'elec fuel (mmbtu)']
fuel_cats = facility_fuel_cats.keys()
state_df = state_df.loc[(state_df['year'] >= 2014) &
(state_df['type'].isin(fuel_cats)), keep_cols]
In [32]:
state_df['type'].unique()
Out[32]:
In [33]:
annual_facility = facility_df.groupby(['year', 'state', 'fuel category']).sum()
# annual_facility.reset_index(inplace=True)
annual_facility.drop('plant id', axis=1, inplace=True)
In [34]:
annual_facility.head()
Out[34]:
In [35]:
annual_state = state_df.groupby(['year', 'state', 'type']).sum()
# annual_state.reset_index(inplace=True)
In [36]:
annual_state.head(n=25)
Out[36]:
It's interesting that the facility data has fuel consumption for solar generation and the state data doesn't. Looking at a 923 data file, it's clear that the fuel consumption is just based on a conversion efficiency of 36.6% across all facilities.
In [37]:
annual_state.loc[2016, 'CA', 'SUN']
Out[37]:
In [38]:
annual_facility.loc[2016, 'CA', 'SUN']
Out[38]:
How much generation from large sources (Hydro, wind, coal, natural gas, and nuclear) is missed by monthly 923 data?
In [40]:
for fuel in ['HYC', 'WND', 'COW', 'NG', 'NUC', 'SUN']:
state_total = annual_state.loc[2016, :, fuel]['generation (MWh)'].sum()
facility_total = annual_facility.loc[2016, :, fuel]['generation (MWh)'].sum()
error = (state_total - facility_total) / state_total
print('{} has an error of {:.2f}%'.format(fuel, error * 100))
In [18]:
path = os.path.join('Data storage', 'EIA923_Schedules_2_3_4_5_M_12_2015_Final.xlsx')
frequency = pd.read_excel(path, sheetname='Page 6 Plant Frame', header=4)
In [19]:
frequency.head()
Out[19]:
In [20]:
frequency.rename(columns={'Plant Id': 'plant id',
'Plant State': 'state',
'YEAR': 'year',
'Reporting\nFrequency': 'Reporting Frequency'}, inplace=True)
In [99]:
frequency.head()
Out[99]:
In [21]:
frequency.dtypes
Out[21]:
Make a dataframe with generation, fuel consumption, and reporting frequency of facilities in 2015
In [22]:
freq_cols = ['year', 'plant id', 'Reporting Frequency']
df = pd.merge(facility_df, frequency.loc[:, freq_cols], on=['year', 'plant id'])
In [80]:
df.head()
Out[80]:
In [78]:
g = sns.factorplot(x='fuel category', y='generation (MWh)', hue='Reporting Frequency',
col='NERC', col_wrap=3, data=df, estimator=np.sum, ci=0, kind='bar',
palette='tab10')
g.set_xticklabels(rotation=30)
Out[78]:
Number of NERC regions in a state
In [24]:
df.loc[df['state'] == 'TX', 'NERC'].nunique()
Out[24]:
This is development of a method that will be used to approximate the fraction of EIA-estimated generation and consumption within each state that gets apportioned to each NERC regions (when there is more than one). The idea is to take data from the most recent "final" EIA-923 and use the annual reporting facilities to approximate the divisions for more recent data. I still need to figure out if it's better to do the calculation by month within a year or just for the year as a whole.
Determining if it's better to do month-by-month vs a single value for the whole year will depend on if the share of generation/consumption from Annual reporting facilities in each NERC changes much over the course of the year. There is the potential for error either way, and maybe even differences by state. Annual is certainly simpler.
While looking at data for Texas, I've discovered that generation from Annual reporting facilities can be negative. Need to figure out how (if?) to deal with this...
While there can be variation of % generation in each NERC within a state over the course of 2015, most fuel categories across most states are quite stable. And when fuels do a have a wide spread over the year, they also tend to not be a large fraction of total generation within the NERC region. Given these observations, I'm going to stick with a split calculated as the average over an entire year.
In [81]:
def annual(df, state):
"""Return the percent of gen & consumption by fuel type in each NERC region
for a state"""
a = df.loc[(df.state == state) &
(df['Reporting Frequency'] == 'A')].copy()
a.drop(['plant id', 'year'], axis=1, inplace=True)
a = a.groupby(['NERC', 'fuel category']).sum()
fuels = set(a.index.get_level_values('fuel category'))
temp_list = []
for fuel in fuels:
temp = (a.xs(fuel, level='fuel category')
/ a.xs(fuel, level='fuel category').sum())
temp['fuel category'] = fuel
temp_list.append(temp)
result = pd.concat(temp_list)
result.reset_index(inplace=True)
result['state'] = state
rename_cols = {'generation (MWh)': '% generation',
'total fuel (mmbtu)': '% total fuel',
'elec fuel (mmbtu)': '% elec fuel'}
result.rename(columns=rename_cols, inplace=True)
return result
In [30]:
def annual_month(df, state):
"""Return the percent of gen & consumption by fuel type and month in each
NERC region for a state"""
a = df.loc[(df.state == state) &
(df['Reporting Frequency'] == 'A')].copy()
a.drop(['plant id', 'year'], axis=1, inplace=True)
a = a.groupby(['NERC', 'fuel category', 'month']).sum()
fuels = set(a.index.get_level_values('fuel category'))
temp_list = []
for fuel in fuels:
for month in range(1, 13):
temp = (a.xs(fuel, level='fuel category')
.xs(month, level='month')
/ a.xs(fuel, level='fuel category')
.xs(month, level='month')
.sum())
temp['fuel category'] = fuel
temp['month'] = month
temp_list.append(temp)
result = pd.concat(temp_list)
result.reset_index(inplace=True)
result['state'] = state
rename_cols = {'generation (MWh)': '% generation',
'total fuel (mmbtu)': '% total fuel',
'elec fuel (mmbtu)': '% elec fuel'}
result.rename(columns=rename_cols, inplace=True)
return result
This is the percent of generation, total fuel consumption, and electric fuel consumption from facilities that report annually to EIA-923
In [82]:
df_list = []
for state in states:
num_nerc = df.loc[df.state == state, 'NERC'].nunique()
if num_nerc > 1:
df_list.append(annual(df, state))
In [31]:
df_list = []
for state in states:
num_nerc = df.loc[df.state == state, 'NERC'].nunique()
if num_nerc > 1:
df_list.append(annual_month(df, state))
In [32]:
fuel_by_nerc_month = pd.concat(df_list).reset_index(drop=True)
In [83]:
fuel_by_nerc = pd.concat(df_list).reset_index(drop=True)
In [84]:
fuel_by_nerc.head()
Out[84]:
In [214]:
fuel_by_nerc_month.tail()
Out[214]:
In [ ]:
st
In [27]:
split_states = []
for state in states:
if df.loc[df.state == state, 'NERC'].nunique() > 1:
split_states.append(state)
In [28]:
split_states
Out[28]:
In [56]:
cols = ['state', 'NERC', 'fuel category']
a = fuel_by_nerc_month.groupby(cols).std()
a.drop('month', axis=1, inplace=True)
In [63]:
a.xs('AR', level='state')
Out[63]:
In [70]:
a[a > .1].dropna(how='all')
Out[70]:
In [75]:
fuels = ['NG', 'HYC', 'COW', 'GEO', 'WND', 'SUN']
sns.factorplot(x='month', y='% generation', hue='fuel category', col='NERC',
row='state',
data=fuel_by_nerc_month.loc[(fuel_by_nerc_month['fuel category'].isin(fuels)) &
(fuel_by_nerc_month['NERC'] != '-')],
n_boot=1)
path = os.path.join('Figures', 'SI', 'Annual facility seasonal gen variation.pdf')
# plt.savefig(path, bbox_inches='tight')
In [222]:
fuel_by_nerc_month.loc[(fuel_by_nerc_month.state=='TX') &
(fuel_by_nerc_month['fuel category'] == 'WWW')]
Out[222]:
In [232]:
df.loc[(df.state == 'TX') &
(df['fuel category'] == 'WWW') &
(df['Reporting Frequency'] == 'A')].groupby(['NERC', 'month', 'fuel category']).sum()
Out[232]:
In [230]:
df.loc[(df.state == 'TX') &
(df['fuel category'] == 'WWW') &
(df['Reporting Frequency'] == 'A')].groupby(['NERC', 'fuel category']).sum()
Out[230]:
In [79]:
NERC_states = ['WY', 'SD', 'NE', 'OK', 'TX', 'NM', 'LA', 'AR',
'MO', 'MN', 'IL', 'KY', 'VA', 'FL']
In [93]:
error_list = []
for state in NERC_states:
error = (annual_state.loc[2016, state]
- annual_facility.loc[2016, state]) / annual_state.loc[2016, state]
error['state'] = state
for col in ['generation (MWh)']:#, 'elec fuel (mmbtu)']:
if error.loc[error[col] > 0.05, col].any():
error_list.append(error.loc[error[col] > 0.05])
The dataframe below shows all states with more than one NERC region where facility generation is at least 5% below EIA's state-level estimate in 2016.
In [94]:
pd.concat(error_list)
Out[94]: